Assignment: Build a Logistic Regression Model for Diabetes Prediction¶
- Objective: The objective of this assignment is to build a predictive model to predict the likelihood of a patient having diabetes based on certain features.
01. DATA PREPARATION¶
a. Load the dataset, Explore the dataset to understand its structure and contents
#1. import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
#2. load the dataset
diabetes_df = pd.read_csv('/content/diabetes2.csv')
#3. Display the first 5 rows of the DataFrame
diabetes_df.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
#4. Display the last 5 rows of the DataFrame
diabetes_df.tail()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 763 | 10 | 101 | 76 | 48 | 180 | 32.9 | 0.171 | 63 | 0 |
| 764 | 2 | 122 | 70 | 27 | 0 | 36.8 | 0.340 | 27 | 0 |
| 765 | 5 | 121 | 72 | 23 | 112 | 26.2 | 0.245 | 30 | 0 |
| 766 | 1 | 126 | 60 | 0 | 0 | 30.1 | 0.349 | 47 | 1 |
| 767 | 1 | 93 | 70 | 31 | 0 | 30.4 | 0.315 | 23 | 0 |
#5. dataset shape
diabetes_df.shape
(768, 9)
#6. dataset columns info
diabetes_df.columns
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
dtype='object')
#7. Print information about the DataFrame
diabetes_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 768 entries, 0 to 767 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pregnancies 768 non-null int64 1 Glucose 768 non-null int64 2 BloodPressure 768 non-null int64 3 SkinThickness 768 non-null int64 4 Insulin 768 non-null int64 5 BMI 768 non-null float64 6 DiabetesPedigreeFunction 768 non-null float64 7 Age 768 non-null int64 8 Outcome 768 non-null int64 dtypes: float64(2), int64(7) memory usage: 54.1 KB
#8. Get descriptive analysis
diabetes_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Pregnancies | 768.0 | 3.845052 | 3.369578 | 0.000 | 1.00000 | 3.0000 | 6.00000 | 17.00 |
| Glucose | 768.0 | 120.894531 | 31.972618 | 0.000 | 99.00000 | 117.0000 | 140.25000 | 199.00 |
| BloodPressure | 768.0 | 69.105469 | 19.355807 | 0.000 | 62.00000 | 72.0000 | 80.00000 | 122.00 |
| SkinThickness | 768.0 | 20.536458 | 15.952218 | 0.000 | 0.00000 | 23.0000 | 32.00000 | 99.00 |
| Insulin | 768.0 | 79.799479 | 115.244002 | 0.000 | 0.00000 | 30.5000 | 127.25000 | 846.00 |
| BMI | 768.0 | 31.992578 | 7.884160 | 0.000 | 27.30000 | 32.0000 | 36.60000 | 67.10 |
| DiabetesPedigreeFunction | 768.0 | 0.471876 | 0.331329 | 0.078 | 0.24375 | 0.3725 | 0.62625 | 2.42 |
| Age | 768.0 | 33.240885 | 11.760232 | 21.000 | 24.00000 | 29.0000 | 41.00000 | 81.00 |
| Outcome | 768.0 | 0.348958 | 0.476951 | 0.000 | 0.00000 | 0.0000 | 1.00000 | 1.00 |
b. Data cleaning and perform preprocessing
#1. check for missing values
diabetes_df.isnull().sum()
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
#2. Removecheck for duplicates
diabetes_df.duplicated().sum()
0
#3. Dividing features into Numerical and Categorical
col = list(diabetes_df.columns)
categorical_features = []
numerical_features = []
for i in col:
if len(diabetes_df[i].unique()) > 6:
numerical_features.append(i)
else:
categorical_features.append(i)
print('Categorical Features :',*categorical_features)
print('Numerical Features :',*numerical_features)
Categorical Features : Outcome Numerical Features : Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
#4. Plotting histograms to see the frequency distribution
diabetes_df.hist(bins=30, figsize=(20,15))
plt.show()
There is an issue with this dataset, as certain numerical values, such as BP, skin thickness, etc., cannot realistically be zero.
Therefore replace missing values with the mean.
#5. Replacing the zero values with mean
diabetes_df['Glucose'] = diabetes_df['Glucose'].replace(0, diabetes_df['Glucose'].mean())
diabetes_df['BMI'] = diabetes_df['BMI'].replace(0, diabetes_df['BMI'].mean())
diabetes_df['BloodPressure'] = diabetes_df['BloodPressure'].replace(0, diabetes_df['BloodPressure'].mean())
diabetes_df['SkinThickness'] = diabetes_df['SkinThickness'].replace(0, diabetes_df['SkinThickness'].mean())
diabetes_df['Insulin'] = diabetes_df['Insulin'].replace(0, diabetes_df['Insulin'].mean())
#6. check statistics again to make sure the replace of values
diabetes_df.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Pregnancies | 768.0 | 3.845052 | 3.369578 | 0.000 | 1.000000 | 3.000000 | 6.00000 | 17.00 |
| Glucose | 768.0 | 121.681605 | 30.436016 | 44.000 | 99.750000 | 117.000000 | 140.25000 | 199.00 |
| BloodPressure | 768.0 | 72.254807 | 12.115932 | 24.000 | 64.000000 | 72.000000 | 80.00000 | 122.00 |
| SkinThickness | 768.0 | 26.606479 | 9.631241 | 7.000 | 20.536458 | 23.000000 | 32.00000 | 99.00 |
| Insulin | 768.0 | 118.660163 | 93.080358 | 14.000 | 79.799479 | 79.799479 | 127.25000 | 846.00 |
| BMI | 768.0 | 32.450805 | 6.875374 | 18.200 | 27.500000 | 32.000000 | 36.60000 | 67.10 |
| DiabetesPedigreeFunction | 768.0 | 0.471876 | 0.331329 | 0.078 | 0.243750 | 0.372500 | 0.62625 | 2.42 |
| Age | 768.0 | 33.240885 | 11.760232 | 21.000 | 24.000000 | 29.000000 | 41.00000 | 81.00 |
| Outcome | 768.0 | 0.348958 | 0.476951 | 0.000 | 0.000000 | 0.000000 | 1.00000 | 1.00 |
#7.Create heatmap to find co-realtion
plt.figure(figsize=(10, 8))
sns.heatmap(diabetes_df.corr(),cmap = "coolwarm", annot= True)
plt.show()
#8. check correaltion values for the dataset
diabetes_df.corr()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| Pregnancies | 1.000000 | 0.127964 | 0.208984 | 0.013376 | -0.018082 | 0.021546 | -0.033523 | 0.544341 | 0.221898 |
| Glucose | 0.127964 | 1.000000 | 0.219666 | 0.160766 | 0.396597 | 0.231478 | 0.137106 | 0.266600 | 0.492908 |
| BloodPressure | 0.208984 | 0.219666 | 1.000000 | 0.134155 | 0.010926 | 0.281231 | 0.000371 | 0.326740 | 0.162986 |
| SkinThickness | 0.013376 | 0.160766 | 0.134155 | 1.000000 | 0.240361 | 0.535703 | 0.154961 | 0.026423 | 0.175026 |
| Insulin | -0.018082 | 0.396597 | 0.010926 | 0.240361 | 1.000000 | 0.189856 | 0.157806 | 0.038652 | 0.179185 |
| BMI | 0.021546 | 0.231478 | 0.281231 | 0.535703 | 0.189856 | 1.000000 | 0.153508 | 0.025748 | 0.312254 |
| DiabetesPedigreeFunction | -0.033523 | 0.137106 | 0.000371 | 0.154961 | 0.157806 | 0.153508 | 1.000000 | 0.033561 | 0.173844 |
| Age | 0.544341 | 0.266600 | 0.326740 | 0.026423 | 0.038652 | 0.025748 | 0.033561 | 1.000000 | 0.238356 |
| Outcome | 0.221898 | 0.492908 | 0.162986 | 0.175026 | 0.179185 | 0.312254 | 0.173844 | 0.238356 | 1.000000 |
#9. Univariate Analysis - Target Variable(outcome) vs Features - SCATTER PLOT
# Set custom colors
colors = ['#446BAD', '#A2A2A2']
# Plot scatter plots for each numerical feature against Outcome
a = 0
fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
target_variable = 'Outcome'
for i in range(len(numerical_features)):
a += 1
plt.subplot(4, 2, a)
sns.scatterplot(x=numerical_features[i], y=target_variable, data=diabetes_df, hue=target_variable, palette=colors, edgecolor='black')
plt.legend(['Non-Diabetes', 'Diabetes'])
title = f'{numerical_features[i]} vs {target_variable}'
plt.title(title)
plt.tight_layout()
plt.show()
#10. Univariate Analysis - Target Variable(outcome) vs Features - HISTOGRAM
import matplotlib.pyplot as plt
import seaborn as sns
# Set custom colors
colors = ['#446BAD', '#A2A2A2']
# Plot histograms for each numerical feature against Outcome
a = 0
fig, ax = plt.subplots(nrows=4, ncols=2, figsize=(20, 20))
target_variable = 'Outcome'
for i in range(len(numerical_features)):
a += 1
plt.subplot(4, 2, a)
sns.histplot(x=numerical_features[i], data=diabetes_df, hue=target_variable, multiple="stack", palette=colors, edgecolor='black', shrink=0.8)
plt.legend(['Non-Diabetes', 'Diabetes'])
title = f'{numerical_features[i]} vs {target_variable}'
plt.title(title)
plt.tight_layout()
plt.show()
#11. Numerical features vs Numerical features w.r.t Target variable (Outcome)
colors = ['#446BAD','#A2A2A2']
a = 0
fig,ax = plt.subplots(nrows = 14,ncols = 2,figsize = (35,70))
for i in range(len(numerical_features)):
for j in range(len(numerical_features)):
if i != j and j > i:
a += 1
plt.subplot(14,2,a)
sns.scatterplot(x = numerical_features[i],y = numerical_features[j],data = diabetes_df,hue = 'Outcome',palette = colors,edgecolor = 'black');
plt.legend(['Non-Diabetes','Diabetes'],loc = 'upper left',)
title = numerical_features[i] + ' vs ' + numerical_features[j]
plt.title(title)
#12. Counting the values for both categories
diabetes_df['Outcome'].value_counts()
0 500 1 268 Name: Outcome, dtype: int64
#13. Check outliers - create box plots
plt.figure(figsize=(15, 5))
sns.boxplot(data=diabetes_df)
plt.title('Boxplots Before Outlier Handling')
plt.show()
Using initial dataset without oversampling- build a Logistic Regression Model¶
c. Split the data into a training set and a test set
#1. Define variables
X = diabetes_df.drop(columns=['Outcome']) # input variables
y = diabetes_df['Outcome'] # Output variable
#2. Split the data into a training set and a test set (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Display the shape of the resulting sets
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
Shape of X_train: (614, 8) Shape of X_test: (154, 8) Shape of y_train: (614,) Shape of y_test: (154,)
#3. Standarization of data
scaler = StandardScaler()
# scaling for independent variables
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)
#4. with scaling
# Convert scaled data back to DataFrame with column names
X_train_sc_df = pd.DataFrame(X_train_sc, columns=X.columns)
# Boxplot with scaled data
plt.figure(figsize=(12, 6))
sns.boxplot(data=X_train_sc_df)
plt.show()
2. Model Training¶
**a. Use Logistic Regression model to predict the relationship between the independent variables- all numerical features and the dependent variable - categorical value( diabetes=0 and non-diabetics= 1)
# 1. Create and fit the multiple linear regression model to the training set
clf_model = LogisticRegression()
clf_model.fit(X_train_sc, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# 2. Find probabilities on the test set
clf_model.predict_proba(X_test_sc)
array([[0.74085216, 0.25914784],
[0.8272927 , 0.1727073 ],
[0.89038968, 0.10961032],
[0.84852405, 0.15147595],
[0.53605745, 0.46394255],
[0.58071201, 0.41928799],
[0.98634383, 0.01365617],
[0.59225165, 0.40774835],
[0.42899498, 0.57100502],
[0.22818685, 0.77181315],
[0.77335102, 0.22664898],
[0.10157941, 0.89842059],
[0.64262596, 0.35737404],
[0.70792494, 0.29207506],
[0.92133815, 0.07866185],
[0.61275053, 0.38724947],
[0.88416717, 0.11583283],
[0.92906588, 0.07093412],
[0.25599993, 0.74400007],
[0.40496577, 0.59503423],
[0.81355128, 0.18644872],
[0.92765719, 0.07234281],
[0.50945271, 0.49054729],
[0.90698682, 0.09301318],
[0.44496951, 0.55503049],
[0.10612143, 0.89387857],
[0.89077844, 0.10922156],
[0.97090681, 0.02909319],
[0.74571181, 0.25428819],
[0.89181393, 0.10818607],
[0.07399538, 0.92600462],
[0.13249625, 0.86750375],
[0.19853621, 0.80146379],
[0.30977293, 0.69022707],
[0.4085824 , 0.5914176 ],
[0.2886708 , 0.7113292 ],
[0.03333078, 0.96666922],
[0.78732048, 0.21267952],
[0.53184608, 0.46815392],
[0.47241158, 0.52758842],
[0.93452739, 0.06547261],
[0.42826503, 0.57173497],
[0.46828173, 0.53171827],
[0.69030534, 0.30969466],
[0.9760205 , 0.0239795 ],
[0.465516 , 0.534484 ],
[0.38357121, 0.61642879],
[0.80248085, 0.19751915],
[0.67257987, 0.32742013],
[0.03857973, 0.96142027],
[0.95873364, 0.04126636],
[0.32996087, 0.67003913],
[0.18772931, 0.81227069],
[0.74957294, 0.25042706],
[0.89946934, 0.10053066],
[0.9621589 , 0.0378411 ],
[0.22196141, 0.77803859],
[0.95146481, 0.04853519],
[0.57418062, 0.42581938],
[0.2242079 , 0.7757921 ],
[0.26451922, 0.73548078],
[0.66597974, 0.33402026],
[0.81650894, 0.18349106],
[0.80813316, 0.19186684],
[0.92757938, 0.07242062],
[0.37669377, 0.62330623],
[0.95905146, 0.04094854],
[0.2555028 , 0.7444972 ],
[0.96713622, 0.03286378],
[0.258921 , 0.741079 ],
[0.31252247, 0.68747753],
[0.94363994, 0.05636006],
[0.84837118, 0.15162882],
[0.88671603, 0.11328397],
[0.91656659, 0.08343341],
[0.5446714 , 0.4553286 ],
[0.85867497, 0.14132503],
[0.87480829, 0.12519171],
[0.8663434 , 0.1336566 ],
[0.77363013, 0.22636987],
[0.32471361, 0.67528639],
[0.85403224, 0.14596776],
[0.94512869, 0.05487131],
[0.60680899, 0.39319101],
[0.72873676, 0.27126324],
[0.13344329, 0.86655671],
[0.09566035, 0.90433965],
[0.69089798, 0.30910202],
[0.89502445, 0.10497555],
[0.91742744, 0.08257256],
[0.93716307, 0.06283693],
[0.77818206, 0.22181794],
[0.96485288, 0.03514712],
[0.69043108, 0.30956892],
[0.46680484, 0.53319516],
[0.35449407, 0.64550593],
[0.65330478, 0.34669522],
[0.87771548, 0.12228452],
[0.35079174, 0.64920826],
[0.93426554, 0.06573446],
[0.25621498, 0.74378502],
[0.94122626, 0.05877374],
[0.21933196, 0.78066804],
[0.46924308, 0.53075692],
[0.33537639, 0.66462361],
[0.78129356, 0.21870644],
[0.72605809, 0.27394191],
[0.23070639, 0.76929361],
[0.88180765, 0.11819235],
[0.50643214, 0.49356786],
[0.91474042, 0.08525958],
[0.64940966, 0.35059034],
[0.83504472, 0.16495528],
[0.23993551, 0.76006449],
[0.82434259, 0.17565741],
[0.67601828, 0.32398172],
[0.21774979, 0.78225021],
[0.79036848, 0.20963152],
[0.943055 , 0.056945 ],
[0.65345728, 0.34654272],
[0.94574617, 0.05425383],
[0.70866829, 0.29133171],
[0.77916973, 0.22083027],
[0.92672051, 0.07327949],
[0.72043528, 0.27956472],
[0.61509243, 0.38490757],
[0.70458802, 0.29541198],
[0.11750909, 0.88249091],
[0.06721692, 0.93278308],
[0.25335347, 0.74664653],
[0.26496395, 0.73503605],
[0.13366133, 0.86633867],
[0.91418741, 0.08581259],
[0.56870195, 0.43129805],
[0.15574363, 0.84425637],
[0.89802257, 0.10197743],
[0.84619727, 0.15380273],
[0.13239223, 0.86760777],
[0.18700484, 0.81299516],
[0.98842896, 0.01157104],
[0.91576127, 0.08423873],
[0.96528311, 0.03471689],
[0.79540889, 0.20459111],
[0.60875807, 0.39124193],
[0.88659867, 0.11340133],
[0.75309858, 0.24690142],
[0.88426628, 0.11573372],
[0.98307115, 0.01692885],
[0.60651656, 0.39348344],
[0.2353387 , 0.7646613 ],
[0.89013855, 0.10986145],
[0.5548016 , 0.4451984 ],
[0.76599849, 0.23400151],
[0.82016188, 0.17983812]])
# 3. Make predictions on the test set
y_pred=clf_model.predict(X_test_sc)
y_pred
array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])
3. Model Evaluation¶
a. Evaluate the trained Logistic regression model's performance on the test dataset.
# 1. Evaluate the model performance
# Import necessary libraries
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_curve
print('Accuracy_score', accuracy_score(y_test,y_pred))
print('Precision_score', precision_score(y_test,y_pred))
print('Recall_score', recall_score(y_test,y_pred))
print('F1_score', f1_score(y_test,y_pred))
print('Classification_report', classification_report(y_test,y_pred))
print('Confusion_matrix', confusion_matrix(y_test,y_pred))
Accuracy_score 0.7662337662337663
Precision_score 0.6862745098039216
Recall_score 0.6363636363636364
F1_score 0.660377358490566
Classification_report precision recall f1-score support
0 0.81 0.84 0.82 99
1 0.69 0.64 0.66 55
accuracy 0.77 154
macro avg 0.75 0.74 0.74 154
weighted avg 0.76 0.77 0.76 154
Confusion_matrix [[83 16]
[20 35]]
#2. create confusion matrix to identify TP,TN,FP,FN
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Custom color palette
colors = sns.color_palette("Blues")
# Create a bar chart with custom color
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt="d", cmap=colors)
# Labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# Set tick labels for x and y axes
ax.xaxis.set_ticklabels(['Diabetic', 'Non-diabetic'])
ax.yaxis.set_ticklabels(['Diabetic', 'Non-diabetic'])
plt.show()
#3. Plot ROC_AUC Curve
# Assuming y_pred_prob is the predicted probabilities for the positive class
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, linewidth=2, label=None)
plt.plot([0, 1], [0, 1], 'k--')
plt.text(0.12, 0.71, "Higher\nthreshold", color="#333333")
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.grid()
plt.axis([0, 1, 0, 1])
plt.legend(loc="lower right", fontsize=13)
plt.show()
WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
4. SMOT - Synthetically Minority Oversampling Technique¶
#1. import SMOTE library to do oversampling since the dataset is imbalanced
from imblearn.over_sampling import SMOTE
smote=SMOTE(random_state=42)
#2. Check shape of oversampled dataset
X.sm ,y.sm=smote.fit_resample(X,y)
X.sm.shape, y.sm.shape
<ipython-input-249-d8545b1cad69>:2: UserWarning: Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access X.sm ,y.sm=smote.fit_resample(X,y)
((1000, 8), (1000,))
y.sm.value_counts()
1 500 0 500 Name: Outcome, dtype: int64
#3,. Split the oversampled dataset
X_train, X_test, y_train, y_test = train_test_split(X.sm, y.sm, test_size=0.2, random_state=42)
#4. Do the scaling
scale=StandardScaler()
X_train_sc=scale.fit_transform(X_train)
X_test_sc=scale.transform(X_test)
**a. Use Logistic Regression model to predict the relationship between the independent variables- all numerical features and the dependent variable - categorical value( diabetes=0 and non-diabetics= 1)-- using oversampled dataset
Build a Logistic Regression Model for oversampled dataset ::¶
# 5. Create and fit the logistic regression model to the training set
clf_model2 = LogisticRegression()
clf_model2.fit(X_train_sc, y_train)
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
# 6. Find probabilities on the test set
clf_model2.predict_proba(X_test_sc)
array([[0.64416128, 0.35583872],
[0.85519057, 0.14480943],
[0.11930539, 0.88069461],
[0.14098068, 0.85901932],
[0.73412642, 0.26587358],
[0.49250249, 0.50749751],
[0.83706601, 0.16293399],
[0.89461403, 0.10538597],
[0.10273443, 0.89726557],
[0.8401672 , 0.1598328 ],
[0.22007639, 0.77992361],
[0.90917185, 0.09082815],
[0.76511612, 0.23488388],
[0.12916435, 0.87083565],
[0.02698025, 0.97301975],
[0.14764475, 0.85235525],
[0.29892592, 0.70107408],
[0.18751834, 0.81248166],
[0.01941508, 0.98058492],
[0.084372 , 0.915628 ],
[0.11815946, 0.88184054],
[0.92571652, 0.07428348],
[0.40683524, 0.59316476],
[0.84469619, 0.15530381],
[0.93568085, 0.06431915],
[0.04252913, 0.95747087],
[0.52521422, 0.47478578],
[0.57674189, 0.42325811],
[0.05648779, 0.94351221],
[0.15471516, 0.84528484],
[0.40879984, 0.59120016],
[0.77874541, 0.22125459],
[0.80309089, 0.19690911],
[0.20120867, 0.79879133],
[0.05106339, 0.94893661],
[0.49708726, 0.50291274],
[0.81352205, 0.18647795],
[0.68436291, 0.31563709],
[0.31112312, 0.68887688],
[0.70826457, 0.29173543],
[0.67057955, 0.32942045],
[0.88227397, 0.11772603],
[0.42691806, 0.57308194],
[0.20556115, 0.79443885],
[0.52510947, 0.47489053],
[0.87762745, 0.12237255],
[0.37206278, 0.62793722],
[0.89492044, 0.10507956],
[0.51658499, 0.48341501],
[0.33358654, 0.66641346],
[0.89505876, 0.10494124],
[0.08828987, 0.91171013],
[0.34873855, 0.65126145],
[0.37115137, 0.62884863],
[0.58250017, 0.41749983],
[0.41251349, 0.58748651],
[0.98111979, 0.01888021],
[0.67533173, 0.32466827],
[0.15141512, 0.84858488],
[0.63042378, 0.36957622],
[0.6602634 , 0.3397366 ],
[0.17318756, 0.82681244],
[0.47990234, 0.52009766],
[0.04216307, 0.95783693],
[0.11779316, 0.88220684],
[0.36280724, 0.63719276],
[0.14665143, 0.85334857],
[0.40999635, 0.59000365],
[0.6955311 , 0.3044689 ],
[0.84287174, 0.15712826],
[0.36242186, 0.63757814],
[0.37631924, 0.62368076],
[0.09307489, 0.90692511],
[0.71832245, 0.28167755],
[0.1993579 , 0.8006421 ],
[0.59077775, 0.40922225],
[0.51083085, 0.48916915],
[0.29103765, 0.70896235],
[0.98335651, 0.01664349],
[0.79341163, 0.20658837],
[0.91227732, 0.08772268],
[0.72313087, 0.27686913],
[0.57225916, 0.42774084],
[0.36504237, 0.63495763],
[0.59182587, 0.40817413],
[0.19682934, 0.80317066],
[0.60118814, 0.39881186],
[0.85264277, 0.14735723],
[0.21724441, 0.78275559],
[0.0357204 , 0.9642796 ],
[0.81886963, 0.18113037],
[0.55547993, 0.44452007],
[0.63107329, 0.36892671],
[0.76726654, 0.23273346],
[0.47692737, 0.52307263],
[0.12681438, 0.87318562],
[0.51522572, 0.48477428],
[0.09553699, 0.90446301],
[0.50825096, 0.49174904],
[0.63570784, 0.36429216],
[0.09804842, 0.90195158],
[0.50532743, 0.49467257],
[0.63538578, 0.36461422],
[0.64422294, 0.35577706],
[0.43227059, 0.56772941],
[0.82569952, 0.17430048],
[0.84003537, 0.15996463],
[0.03395882, 0.96604118],
[0.6431749 , 0.3568251 ],
[0.41778805, 0.58221195],
[0.78763982, 0.21236018],
[0.40904053, 0.59095947],
[0.23972084, 0.76027916],
[0.29515341, 0.70484659],
[0.32680222, 0.67319778],
[0.93202392, 0.06797608],
[0.46943111, 0.53056889],
[0.80771859, 0.19228141],
[0.79112819, 0.20887181],
[0.31977908, 0.68022092],
[0.56399509, 0.43600491],
[0.09894188, 0.90105812],
[0.78718054, 0.21281946],
[0.08060347, 0.91939653],
[0.05632259, 0.94367741],
[0.76359472, 0.23640528],
[0.51064509, 0.48935491],
[0.00140438, 0.99859562],
[0.387152 , 0.612848 ],
[0.29299785, 0.70700215],
[0.83764375, 0.16235625],
[0.5264694 , 0.4735306 ],
[0.55962434, 0.44037566],
[0.13557461, 0.86442539],
[0.82890672, 0.17109328],
[0.97225042, 0.02774958],
[0.03530205, 0.96469795],
[0.04474738, 0.95525262],
[0.37379433, 0.62620567],
[0.08718522, 0.91281478],
[0.18264806, 0.81735194],
[0.23245496, 0.76754504],
[0.69223996, 0.30776004],
[0.25013034, 0.74986966],
[0.35607424, 0.64392576],
[0.258382 , 0.741618 ],
[0.01688595, 0.98311405],
[0.32648525, 0.67351475],
[0.09571693, 0.90428307],
[0.76074829, 0.23925171],
[0.85127401, 0.14872599],
[0.92632721, 0.07367279],
[0.10783955, 0.89216045],
[0.86585867, 0.13414133],
[0.14343705, 0.85656295],
[0.70833428, 0.29166572],
[0.52416788, 0.47583212],
[0.93629555, 0.06370445],
[0.91057088, 0.08942912],
[0.18673066, 0.81326934],
[0.14091446, 0.85908554],
[0.90077875, 0.09922125],
[0.89437674, 0.10562326],
[0.935203 , 0.064797 ],
[0.27077284, 0.72922716],
[0.64503806, 0.35496194],
[0.38241276, 0.61758724],
[0.1762891 , 0.8237109 ],
[0.31166619, 0.68833381],
[0.89885061, 0.10114939],
[0.1621518 , 0.8378482 ],
[0.4724707 , 0.5275293 ],
[0.19074497, 0.80925503],
[0.67546405, 0.32453595],
[0.47359956, 0.52640044],
[0.10620548, 0.89379452],
[0.61580741, 0.38419259],
[0.03639454, 0.96360546],
[0.09949926, 0.90050074],
[0.8390814 , 0.1609186 ],
[0.92253861, 0.07746139],
[0.07968149, 0.92031851],
[0.80133955, 0.19866045],
[0.96210631, 0.03789369],
[0.05621665, 0.94378335],
[0.89634636, 0.10365364],
[0.11404569, 0.88595431],
[0.6155988 , 0.3844012 ],
[0.17589463, 0.82410537],
[0.21937766, 0.78062234],
[0.03596697, 0.96403303],
[0.27326539, 0.72673461],
[0.93161614, 0.06838386],
[0.47514254, 0.52485746],
[0.25720707, 0.74279293],
[0.02171666, 0.97828334],
[0.03919785, 0.96080215],
[0.85212021, 0.14787979],
[0.59759915, 0.40240085],
[0.33806931, 0.66193069]])
# 7. Make predictions on the test set
y_pred=clf_model2.predict(X_test_sc)
y_pred
array([0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0,
1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0,
0, 1])
Evaluate the trained Logistic regression model's performance on the test dataset.
# 8. Evaluate the model performance
# Import necessary libraries
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score, roc_curve
print('Accuracy_score', accuracy_score(y_test,y_pred))
print('Precision_score', precision_score(y_test,y_pred))
print('Recall_score', recall_score(y_test,y_pred))
print('F1_score', f1_score(y_test,y_pred))
print('Classification_report', classification_report(y_test,y_pred))
print('Confusion_matrix', confusion_matrix(y_test,y_pred))
Accuracy_score 0.765
Precision_score 0.7547169811320755
Recall_score 0.7920792079207921
F1_score 0.7729468599033816
Classification_report precision recall f1-score support
0 0.78 0.74 0.76 99
1 0.75 0.79 0.77 101
accuracy 0.77 200
macro avg 0.77 0.76 0.76 200
weighted avg 0.77 0.77 0.76 200
Confusion_matrix [[73 26]
[21 80]]
#9. create confusion matrix to identify TP,TN,FP,FN
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Custom color palette
colors = sns.color_palette("Blues")
# Create a bar chart with custom color
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt="d", cmap=colors)
# Labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# Set tick labels for x and y axes
ax.xaxis.set_ticklabels(['Diabetic', 'Non-diabetic'])
ax.yaxis.set_ticklabels(['Diabetic', 'Non-diabetic'])
plt.show()
#10. Plot ROC_AUC Curve
# Assuming y_pred_prob is the predicted probabilities for the positive class
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
plt.plot(fpr, tpr, linewidth=2, label=None)
plt.plot([0, 1], [0, 1], 'k--')
plt.text(0.12, 0.71, "Higher\nthreshold", color="#333333")
plt.xlabel('False Positive Rate (Fall-Out)')
plt.ylabel('True Positive Rate (Recall)')
plt.grid()
plt.axis([0, 1, 0, 1])
plt.legend(loc="lower right", fontsize=13)
plt.show()
WARNING:matplotlib.legend:No artists with labels found to put in legend. Note that artists whose label start with an underscore are ignored when legend() is called with no argument.
5. Tuning the model with best hyperparameters using same oversampled split dataset¶
from sklearn.model_selection import GridSearchCV
#1. Create logistic regression model
tuned_clf_model = LogisticRegression()
# Create logistic regression model
tuned_clf_model = LogisticRegression()
# Define the hyperparameter grid for Logistic regression
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(tuned_clf_model, param_grid, cv=2, scoring='neg_mean_squared_error')
grid_search.fit(X_train_sc, y_train) # Use scaled features
# Get the best hyperparameters
best_C = grid_search.best_params_['C']
# Fit the Logistic regression model with the best hyperparameters to the scaled training data
tuned_model = LogisticRegression(C=best_C)
tuned_model.fit(X_train_sc, y_train) # Use scaled features
LogisticRegression(C=0.01)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression(C=0.01)
# Make predictions on the test set
y_pred = tuned_model.predict(X_test_sc)
# 2. Evaluate the model performance
# Import necessary libraries
print('Accuracy_score', accuracy_score(y_test,y_pred))
print('Precision_score', precision_score(y_test,y_pred))
print('Recall_score', recall_score(y_test,y_pred))
print('F1_score', f1_score(y_test,y_pred))
print('Classification_report', classification_report(y_test,y_pred))
print('Confusion_matrix', confusion_matrix(y_test,y_pred))
Accuracy_score 0.765
Precision_score 0.7596153846153846
Recall_score 0.7821782178217822
F1_score 0.7707317073170732
Classification_report precision recall f1-score support
0 0.77 0.75 0.76 99
1 0.76 0.78 0.77 101
accuracy 0.77 200
macro avg 0.77 0.76 0.76 200
weighted avg 0.77 0.77 0.76 200
Confusion_matrix [[74 25]
[22 79]]
#3. create confusion matrix to identify TP,TN,FP,FN
# Get the confusion matrix
cm = confusion_matrix(y_test, y_pred)
# Custom color palette
colors = sns.color_palette("Blues")
# Create a bar chart with custom color
ax = plt.subplot()
sns.heatmap(cm, annot=True, ax=ax, fmt="d", cmap=colors)
# Labels and title
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
# Set tick labels for x and y axes
ax.xaxis.set_ticklabels(['Diabetic', 'Non-diabetic'])
ax.yaxis.set_ticklabels(['Diabetic', 'Non-diabetic'])
plt.show()
6. Use stochastic Gradient Descent with Cross Validation k=2¶
#1. Create SGD classifier to evaluate performance of predicting diabetes
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train_sc, y_train)
SGDClassifier(random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SGDClassifier(random_state=42)
#2. Get Cross-Validated Predictions
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
# Compute cross-validated accuracy
cv_accuracy = cross_val_score(sgd_clf, X_train_sc, y_train, cv=2, scoring='accuracy')
print("Accuracy of 2-folds:", cv_accuracy)
# Get cross-validated predictions
cv_predictions = cross_val_predict(sgd_clf, X_train_sc, y_train, cv=3)
# Compute confusion matrix
confusion_matrix_result = confusion_matrix(y_train, cv_predictions)
print("Confusion Matrix:")
print(confusion_matrix_result)
# Calculate precision
precision_cv = precision_score(y_train, cv_predictions)
print("Cross-Validated Precision:", precision_cv)
# Calculate recall
recall_cv = recall_score(y_train, cv_predictions)
print("Cross-Validated Recall:", recall_cv)
# Calculate F1 score
f1_score_cv = f1_score(y_train, cv_predictions)
print("Cross-Validated F1 Score:", f1_score_cv)
# Calculate mean accuracy
mean_accuracy = cv_accuracy.mean()
print("Mean Accuracy:", mean_accuracy)
Accuracy of 2-folds: [0.73 0.725] Confusion Matrix: [[272 129] [115 284]] Cross-Validated Precision: 0.6876513317191283 Cross-Validated Recall: 0.7117794486215538 Cross-Validated F1 Score: 0.6995073891625616 Mean Accuracy: 0.7275
#3. Create confusion matrices for all 3 - folds and the overall
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
def plot_cross_val_confusion_matrix(model, X, y, cv=2):
# Initialize figure
num_rows = 1
num_cols = cv + 1 # One column for each fold and one for overall
plt.figure(figsize=(5 * num_cols, 5 * num_rows))
# Perform cross-validation predictions
cv_predictions = cross_val_predict(model, X, y, cv=cv)
# Compute overall confusion matrix
overall_cm = confusion_matrix(y, cv_predictions)
# Plot confusion matrix for each fold
for i in range(cv):
fold_predictions = cv_predictions[i*len(y)//cv:(i+1)*len(y)//cv]
fold_true_labels = y[i*len(y)//cv:(i+1)*len(y)//cv]
fold_cm = confusion_matrix(fold_true_labels, fold_predictions)
plt.subplot(num_rows, num_cols, i+1)
sns.heatmap(fold_cm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title(f'Fold {i+1}')
plt.xlabel('Predicted Values')
plt.ylabel('True Values')
# Plot overall confusion matrix
plt.subplot(num_rows, num_cols, num_cols)
sns.heatmap(overall_cm, annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title('Overall')
plt.xlabel('Predicted values')
plt.ylabel('True values')
plt.tight_layout()
plt.show()
plot_cross_val_confusion_matrix(sgd_clf, X_train_sc, y_train, cv=2)
#4. Plotting ROC-AUC curves for all the 3-folds
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_predict
# Get cross-validated predicted probabilities
y_scores = cross_val_predict(sgd_clf, X_train_sc, y_train, cv=2, method='decision_function')
# Compute ROC curve for each fold
fpr = []
tpr = []
roc_auc = []
for i in range(3): # Number of folds
fpr_fold, tpr_fold, _ = roc_curve(y_train[i*len(y_train)//3:(i+1)*len(y_train)//3],
y_scores[i*len(y_train)//3:(i+1)*len(y_train)//3])
fpr.append(fpr_fold)
tpr.append(tpr_fold)
roc_auc.append(auc(fpr_fold, tpr_fold))
# Compute micro-average ROC curve and ROC area
fpr_micro, tpr_micro, _ = roc_curve(y_train, y_scores)
roc_auc_micro = auc(fpr_micro, tpr_micro)
# Plot ROC curves for each fold
plt.figure(figsize=(8, 6))
for i in range(3):
plt.plot(fpr[i], tpr[i], label=f'ROC Fold {i+1} (AUC = {roc_auc[i]:0.2f})')
# Plot micro-average ROC curve
plt.plot(fpr_micro, tpr_micro, label=f'Micro-average ROC (AUC = {roc_auc_micro:0.2f})', linestyle='--')
plt.plot([0, 1], [0, 1], 'k--') # Diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()